import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split,\
cross_val_score, KFold, StratifiedShuffleSplit
from sklearn import metrics
from sklearn.feature_selection import RFE
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
# Global seaborn look: white grid background, "paper" context font sizing.
sns.set_style("whitegrid")
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
import matplotlib.colors
from pylab import rcParams
from matplotlib.font_manager import FontProperties
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
# NOTE(review): the 'seaborn-whitegrid' style name was removed in matplotlib
# >= 3.6 (renamed 'seaborn-v0_8-whitegrid') -- confirm the pinned version.
plt.style.use('seaborn-whitegrid')
import matplotlib as mpl
# Default figure geometry and font sizes for all matplotlib plots below.
mpl.rcParams['figure.figsize'] = (17, 6)
mpl.rcParams['axes.labelsize'] = 14
mpl.rcParams['xtick.labelsize'] = 12
mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'
%matplotlib inline
import warnings
# Silence library warnings to keep the notebook output readable.
warnings.filterwarnings("ignore")
In this article, we analyze a weather dataset from Kaggle.com.
Data description from Kaggle:
def Header(Text, L = 100, C = 'Blue', T = 'White'):
    """Print Text as a colored banner padded with '=' up to width L.

    C selects the banner background color (and the color of the '=' ruler);
    T selects the text foreground color.
    """
    back_codes = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
                  'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
                  'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    fore_codes = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
                  'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
                  'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    banner = f"{back_codes[C]}{fore_codes[T]}{Style.NORMAL}{Text}{Style.RESET_ALL}"
    ruler = f"{fore_codes[C]}{Style.NORMAL}{'=' * (L - len(Text) - 1)}{Style.RESET_ALL}"
    print(banner + ' ' + ruler)
def Line(L=100, C = 'Blue'):
    """Print a colored horizontal rule made of L '=' characters in color C."""
    palette = {'Black': Fore.BLACK, 'Red': Fore.RED, 'Green': Fore.GREEN,
               'Yellow': Fore.YELLOW, 'Blue': Fore.BLUE,
               'Magenta': Fore.MAGENTA, 'Cyan': Fore.CYAN, 'White': Fore.WHITE}
    print(palette[C] + Style.NORMAL + '=' * L + Style.RESET_ALL)
def Search_List(Key, List):
    """Return the items of List that contain Key as a substring."""
    matches = []
    for item in List:
        if Key in item:
            matches.append(item)
    return matches
# Target column name and path of the raw Kaggle daily-weather CSV.
Target = 'Relative Humidity (Afternoon)'
Path = 'weatherdata/daily_weather.csv'
# Load the pre-standardized copy ('weatherdata/daily_weather_STD.csv'),
# derived from Path by stripping '.csv' and appending '_STD.csv'.
Data = pd.read_csv(Path.split(".")[0]+'_STD.csv')
Header('Standardized Dataset:')
display(Data.head(10).style.hide_index().set_precision(2))
# Feature descriptions live next to the data, semicolon-separated.
Dataset_features = pd.read_csv(Path.split("/")[0] + '/dataset_features.csv', sep = ';')
Header('Feature Description:')
display(Dataset_features.style.hide_index())
Standardized Dataset: ==============================================================================
| Air Pressure | Air Temperature | Avg Wind Direction | Avg Wind Speed | Max Wind Direction | Max Wind Speed | Rain Accumulation | Rain Duration | Relative Humidity (Morning) | Relative Humidity (Afternoon) |
|---|---|---|---|---|---|---|---|---|---|
| -0.26 | 0.89 | 1.87 | -0.75 | 2.18 | -0.74 | -0.13 | -0.18 | 0.32 | 36.16 |
| -0.48 | 0.58 | -0.58 | -0.67 | -0.13 | -0.62 | -0.13 | -0.18 | -0.39 | 19.43 |
| 1.31 | -0.39 | -1.32 | 2.54 | -1.27 | 2.70 | -0.13 | -0.17 | -1.00 | 14.46 |
| 0.51 | 0.47 | 0.82 | -0.26 | 0.93 | -0.33 | -0.13 | -0.18 | -0.87 | 12.74 |
| 0.72 | -1.85 | 1.97 | -0.80 | -0.19 | -0.74 | 5.47 | 9.05 | 2.28 | 76.74 |
| -1.13 | 1.21 | 0.59 | 0.97 | 0.60 | 0.71 | -0.12 | -0.08 | 0.03 | 33.93 |
| -1.03 | 0.46 | 0.52 | -0.39 | 0.56 | -0.44 | -0.13 | -0.18 | -0.93 | 21.39 |
| -0.26 | -1.19 | 1.45 | -0.66 | 1.83 | -0.60 | -0.13 | -0.18 | 1.82 | 74.92 |
| 0.38 | 1.40 | -1.47 | -0.22 | -1.28 | -0.20 | -0.13 | -0.18 | -0.18 | 24.03 |
| -1.22 | -1.56 | 0.30 | -0.12 | 0.70 | -0.08 | -0.13 | -0.18 | 2.14 | 68.05 |
Feature Description: ===============================================================================
| Feature | Description |
|---|---|
| Air Pressure | Air pressure in hectopascals (100 pascals) at 9 AM |
| Air Temperature | Air temperature in degrees Fahrenheit at 9 AM |
| Avg Wind Direction | Average wind direction over the minute before the timestamp in degrees (0 starts from the north) at 9 AM |
| Avg Wind Speed | Average wind speed over the minute before the timestamp in meters per second (m/s) at 9 AM |
| Max Wind Direction | Highest wind direction in the minute before the timestamp in degrees (0 starts from the north) at 9 AM |
| Max Wind Speed | Highest wind speed in the minute before the timestamp in meters per second (m/s) at 9 AM |
| Min Wind Speed | Smallest wind speed in the minute before the timestamp in meters per second (m/s) at 9 AM |
| Rain Accumulation | Accumulated rain in millimeters (mm) at 9 AM |
| Rain Duration | Length of time it rained, in seconds (s), at 9 AM |
| Relative Humidity (Morning) | Relative humidity in percent at 9 AM |
| Relative Humidity (Afternoon) | Relative humidity in percent at 3 PM |
Let's set Relative Humidity (Afternoon) as the target variable. This means that, given the rest of the features in the dataset, we would like to know whether it is humid or not at 3 PM. In doing so, we define a Humidity Level (Afternoon) feature as follows:
$$\text{Humidity Level (Afternoon)} = k, \qquad k \in \{0, 1, \dots, N-1\},$$ where $k$ indexes the $N$ equal-frequency (quantile) bins of Relative Humidity (Afternoon), from the lowest humidity bin ($k = 0$) to the highest ($k = N-1$).
N = 10
# Discretize the continuous humidity into N equal-frequency (quantile) bins;
# the bin index 0..N-1 becomes the new classification target.
Target = 'Humidity Level (Afternoon)'
Data[Target], bins = pd.qcut(Data['Relative Humidity (Afternoon)'], precision =2,
retbins= True, q=N, labels=np.arange(0, N, 1))
# Drop the raw continuous column from the working frame.
df = Data.drop(columns = ['Relative Humidity (Afternoon)'])
# Map bin index -> readable interval string, e.g. 0 -> '(5.30, 13.25]'.
Labels_dict = dict(list(enumerate(['(%.2f, %.2f]' % (bins[i], bins[i+1]) for i in range(N)])))
del bins
We can visualize the data using Parallel Coordinates.
# Parallel-coordinates view of the standardized rows, colored by level.
Temp = df.copy()
Temp[Target] = Temp[Target].map(Labels_dict)
Temp = Temp.sort_values(by = [Target])
fig, ax = plt.subplots(nrows=1, ncols=1, figsize = (15, 8))
# NOTE(review): only 4 colors are given for N=10 classes, so pandas cycles
# them and distinct levels share colors -- confirm this is intended.
_ = pd.plotting.parallel_coordinates(Temp, class_column = Target, ax = ax,
color=["#3498db", "#e74c3c", "#34495e", "#2ecc71"], axvlines = True)
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize = 12)
_ = ax.legend(title = Target, loc="upper left", fontsize = 12)
_ = ax.set_ylim([-5,20])
_ = ax.axvline(linewidth=1, color='gray')
_ = ax.set_title('Daily Weather Dataset Parallel Coordinates', weight='bold', fontsize = 14)
del Temp
However, the results of this visualization can be improved if a clustering method is used. For this reason, we apply the K-Means clustering method.
# Cluster the feature space into N groups and plot only the N centroids,
# which gives a far less cluttered parallel-coordinates picture.
kmeans = KMeans(n_clusters = N)
Temp = df.drop(columns = Target)
model = kmeans.fit(Temp)
Out = pd.DataFrame(model.cluster_centers_, columns = Temp.columns.tolist())
# NOTE(review): this pairs centroids with the sorted target levels purely by
# position -- KMeans cluster ids have no inherent correspondence to humidity
# levels; confirm this labeling is meant only as illustration.
Out[Target] = np.sort(df[Target].unique().tolist())
display(Out.style.hide_index().set_precision(4))
Temp = Out.copy()
Temp[Target] = Temp[Target].map(Labels_dict)
fig, ax = plt.subplots(nrows=1, ncols=1, figsize = (15.5, 8))
_ = pd.plotting.parallel_coordinates(Temp, class_column = Target, ax = ax,
color=["#3498db", "#e74c3c", "#34495e", "#2ecc71"], axvlines = True)
_ = ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize = 12)
_ = ax.legend(title = Target, loc="upper left", fontsize = 12)
_ = ax.set_ylim([-5, 15])
_ = ax.set_title('Daily Weather Dataset (with %i folds) Parallel Coordinates' % N, weight='bold', fontsize = 14)
| Air Pressure | Air Temperature | Avg Wind Direction | Avg Wind Speed | Max Wind Direction | Max Wind Speed | Rain Accumulation | Rain Duration | Relative Humidity (Morning) | Humidity Level (Afternoon) |
|---|---|---|---|---|---|---|---|---|---|
| 0.6723 | -0.3025 | -1.1107 | -0.2586 | -1.1035 | -0.2472 | -0.1181 | -0.1517 | -0.4330 | 0 |
| 0.6930 | 0.0545 | 0.7881 | -0.4007 | 0.8780 | -0.4012 | -0.1185 | -0.1275 | -0.4886 | 1 |
| -0.6545 | -1.5228 | 0.2211 | 0.1693 | -0.0038 | 0.3049 | 3.3110 | 6.8552 | 1.8983 | 2 |
| 1.0771 | -0.3424 | -1.3204 | 2.5864 | -1.2033 | 2.6510 | -0.1195 | -0.1547 | -0.8016 | 3 |
| -0.9441 | -1.4692 | 0.6439 | 1.0644 | 0.6870 | 0.9381 | -0.0189 | 0.0767 | 1.6000 | 4 |
| 0.7705 | -0.0609 | -1.2616 | 1.1511 | -1.1154 | 1.1888 | -0.1273 | -0.1796 | -0.6032 | 5 |
| -0.7057 | 0.8091 | 0.6947 | -0.4831 | 0.6724 | -0.5064 | -0.1144 | -0.1553 | -0.1982 | 6 |
| -0.5894 | -1.0926 | 0.6512 | -0.4480 | 0.7315 | -0.4563 | -0.0708 | 0.0392 | 1.6633 | 7 |
| -0.3706 | 1.0422 | -0.5534 | -0.6339 | -0.8728 | -0.6137 | -0.1259 | -0.1825 | -0.4069 | 8 |
| -0.6955 | -1.9717 | 1.4712 | 0.1202 | 1.2781 | 0.1169 | 12.0807 | 6.1127 | 2.1548 | 9 |
def DatasetTargetDist(Inp, Target, Labels_dict, PD):
# Display the class distribution of `Target` in `Inp` as a plotly figure:
# a count/percentage table (left subplot) next to a donut pie (right).
#   Inp         -- DataFrame containing the target column.
#   Target      -- target column name (also used as the figure title).
#   Labels_dict -- maps raw class codes to readable interval labels.
#   PD          -- plot-parameter dict (colors, widths, pull, hole, height...).
# Table
Table = Inp[Target].value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
Table[Target] = Table[Target].replace(Labels_dict)
Table['Percentage'] = np.round(100*(Table['Count']/Table['Count'].sum()),2)
fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, column_widths=PD['column_widths'],
specs=[[{"type": "table"},{"type": "pie"}]])
# Right
fig.add_trace(go.Pie(labels=Table[Target].values, values=Table['Count'].values,
pull=PD['pull'], textfont=dict(size= PD['textfont']),
marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=2)
fig.update_traces(hole=PD['hole'])
fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
# Left
# Format percentages as fixed 2-decimal strings for the table cells.
T = Table.copy()
T['Percentage'] = T['Percentage'].map(lambda x: '%.2f' % x)
# go.Table expects one array per column.
Temp = []
for i in T.columns:
Temp.append(T.loc[:,i].values)
fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
fill_color= PD['TableColors'][0], align=['center','center'],
font=dict(color='white', size=12), height=25), columnwidth = PD['tablecolumnwidth'],
cells=dict(values=Temp, line_color='darkslategray',
fill=dict(color= [PD['TableColors'][1], PD['TableColors'][1]]),
align=['center', 'center'], font_size=12, height=20)), 1, 1)
fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x':0.5,
'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
# Plot-parameter bundle for DatasetTargetDist (colors, widths, layout).
PD = dict(PieColors = px.colors.sequential.Plasma_r, TableColors = ['Navy','White'], hole = .4,
column_widths=[0.6, 0.4],textfont = 14, height = 550, tablecolumnwidth = [0.32, 0.15, 0.15],
pull = [.1,.01 ,.01 ,.01], legend_title = Target)
DatasetTargetDist(Data, Target, Labels_dict, PD)
StratifiedShuffleSplit returns stratified randomized train/test splits: each set contains approximately the same percentage of samples of each target class as the complete set.
# Single stratified 70/30 split: train/test class proportions match the full set.
X = Data.drop(columns = [Target])
y = Data[Target].astype(int)
Test_Size = 0.3
sss = StratifiedShuffleSplit(n_splits=1, test_size=Test_Size, random_state=42)
_ = sss.get_n_splits(X, y)
for train_index, test_index in sss.split(X, y):
# X
# NOTE(review): split() yields positional indices; .loc works here only
# because Data carries the default RangeIndex -- .iloc would be the safe
# choice for arbitrary indices.
if isinstance(X, pd.DataFrame):
X_train, X_test = X.loc[train_index], X.loc[test_index]
else:
X_train, X_test = X[train_index], X[test_index]
# y
# NOTE(review): both branches below are identical, so this isinstance
# check is redundant as written.
if isinstance(y, pd.Series):
y_train, y_test = y[train_index], y[test_index]
else:
y_train, y_test = y[train_index], y[test_index]
del sss
def Train_Test_Dist(X_train, y_train, X_test, y_test, Labels_dict, PD):
# Visualize the train/test split: a table of set shapes (left) plus one
# donut pie of the class distribution for each of y_train and y_test.
# Uses the notebook-level `Target` for column naming and the legend title.
def ToSeries(x):
# Coerce arrays to a pandas Series so value_counts() is available.
if not isinstance(x, pd.Series):
Out = pd.Series(x)
else:
Out = x.copy()
return Out
fig = make_subplots(rows=1, cols=3, horizontal_spacing = 0.02, column_widths= [0.25, 0.3, 0.3],
specs=[[{"type": "table"},{'type':'domain'}, {'type':'domain'}]])
# Right
# One pie per split; C is the subplot column (2 = train, 3 = test).
C = 2
for y in [y_train, y_test]:
Table = ToSeries(y).value_counts().to_frame('Count').reset_index(drop = False).rename(columns = {'index':Target})
Table[Target] = Table[Target].replace(Labels_dict)
fig.add_trace(go.Pie(labels= Table[Target], values= Table['Count'], pull=PD['pull'], textfont=dict(size=PD['textfont']),
marker=dict(colors = PD['PieColors'], line=dict(color='black', width=1))), row=1, col=C)
fig.update_traces(hole= PD['hole'])
fig.update_layout(height = PD['height'], legend=dict(orientation="v"), legend_title_text= PD['legend_title'])
C+=1
# Left
# Table
# Shapes of all four split arrays, rendered as strings in a plotly table.
Table = pd.DataFrame(data={'Set':['X_train','X_test','y_train','y_test'],
'Shape':[X_train.shape, X_test.shape, y_train.shape, y_test.shape]}).astype(str)
T = Table.copy()
Temp = []
for i in T.columns:
Temp.append(T.loc[:,i].values)
TableColors = PD['TableColors']
fig.add_trace(go.Table(header=dict(values = list(Table.columns), line_color='darkslategray',
fill_color= TableColors[0], align=['center','center'],
font=dict(color='white', size=12), height=25), columnwidth = [0.15, 0.25],
cells=dict(values=Temp, line_color='darkslategray',
fill=dict(color= [TableColors[1], TableColors[1]]),
align=['center', 'center'], font_size=12, height=20)), 1, 1)
# Centered captions inside the two donut holes.
fig.update_layout(annotations=[dict(text= '<b>' + 'Train<br>Set' + '<b>', x=0.475, y=0.5, font_size=14, showarrow=False),
dict(text= '<b>' + 'Test<br>Set' + '<b>', x=0.86, y=0.5, font_size=14, showarrow=False)])
fig.update_layout(title={'text': '<b>' + 'Train and Test Sets' + '<b>', 'x':0.5,
'y':0.90, 'xanchor': 'center', 'yanchor': 'top'})
if not PD['height'] == None:
fig.update_layout(height = PD['height'])
fig.show()
# Plot-parameter bundle for Train_Test_Dist (pie colors, table colors, sizes).
PD = dict(PieColors = px.colors.sequential.Plasma_r, TableColors = ['Navy','White'], hole = .4,
textfont = 12, height = 400, pull = [.1,.01 ,.01 ,.01], legend_title = Target)
Train_Test_Dist(X_train, y_train, X_test, y_test,Labels_dict ,PD)
A random forest classifier (RFC) fits several decision tree classifiers (each on a sub-sample of the dataset) and then averages their predictions to improve the predictive accuracy. See sklearn.ensemble.RandomForestClassifier for more details.
# Human-readable class labels (quantile interval strings), in level order.
Labels = list(Labels_dict.values())
def Best_Parm(model, param_dist, Top = None, X = X, y = y, n_splits = 20, scoring = 'precision', H = 600, titleY = .95):
    """Tune `model` with RandomizedSearchCV over stratified shuffle splits.

    Displays a styled table of the Top-ranked candidates (train/test score
    and fit time rendered as 'mean ± std'), plots their CV performance via
    Grid_Performance_Plot, and returns the fitted search object.

    Parameters:
        model      -- estimator to tune.
        param_dist -- parameter distributions for the random search.
        Top        -- number of top-ranked rows to show (None = all rows).
        X, y       -- features/target (defaults bound to the notebook X, y).
        n_splits   -- number of stratified shuffle splits (uses Test_Size).
        scoring    -- sklearn scoring string.
        H, titleY  -- figure height / title y, forwarded to the plot helper.
    """
    grid = RandomizedSearchCV(estimator = model, param_distributions = param_dist,
                              cv = StratifiedShuffleSplit(n_splits=n_splits, test_size=Test_Size, random_state=42),
                              n_iter = int(1e3), scoring = scoring, error_score = 0, verbose = 0,
                              n_jobs = 10, return_train_score = True)
    _ = grid.fit(X, y)
    Table = Grid_Table(grid)
    # FIX: compare with None using `is` (PEP 8), not `==`.
    if Top is None:
        Top = Table.shape[0]
    Table = Table.iloc[:Top,:]
    # Render the numeric columns as 'mean ± std' strings for display.
    T = Table.copy()
    T['Train Score'] = T['Mean Train Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Train Score'].map(lambda x: ('%.2e' % x))
    T['Test Score'] = T['Mean Test Score'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Test Score'].map(lambda x: ('%.2e' % x))
    T['Fit Time'] = T['Mean Fit Time'].map(lambda x: ('%.2e' % x))+ ' ± ' +T['STD Fit Time'].map(lambda x: ('%.2e' % x))
    T = T.drop(columns = ['Mean Train Score','STD Train Score','Mean Test Score','STD Test Score','Mean Fit Time','STD Fit Time'])
    display(T.head(Top).style.hide_index().background_gradient(subset= ['Rank Test Score'],
        cmap=sns.diverging_palette(145, 300, s=60, as_cmap=True)).\
        set_properties(subset=['Params'], **{'background-color': 'Indigo', 'color': 'White'}).\
        set_properties(subset=['Train Score'], **{'background-color': 'HoneyDew', 'color': 'Black'}).\
        set_properties(subset=['Test Score'], **{'background-color': 'Azure', 'color': 'Black'}).\
        set_properties(subset=['Fit Time'], **{'background-color': 'Linen', 'color': 'Black'}))
    # Plot CV performance of the displayed candidates.
    Grid_Performance_Plot(Table, n_splits = n_splits, H = H, titleY = titleY)
    return grid
def Grid_Table(grid):
    """Summarize grid.cv_results_ as a DataFrame sorted by test-score rank.

    Columns: rank, a de-braced text rendering of each parameter dict, and the
    mean/std of train score, test score, and fit time.
    """
    results = grid.cv_results_
    # Render each parameter dict as plain 'key: value' text (no braces/quotes).
    params_as_text = [
        str(p).replace('{', '').replace('}', '').replace("'", '')
        for p in results['params']
    ]
    summary = pd.DataFrame({
        'Rank Test Score': results['rank_test_score'],
        'Params': params_as_text,
        # Train
        'Mean Train Score': results['mean_train_score'],
        'STD Train Score': results['std_train_score'],
        # Test
        'Mean Test Score': results['mean_test_score'],
        'STD Test Score': results['std_test_score'],
        # Fit time
        'Mean Fit Time': results['mean_fit_time'],
        'STD Fit Time': results['std_fit_time'],
    })
    return summary.sort_values('Rank Test Score').reset_index(drop=True)
def Grid_Performance_Plot(Table, n_splits, H = 550, titleY =.95):
# Plot mean ± std train/test scores per parameter combination as two
# side-by-side plotly scatter panels sharing the y-axis range [L, R].
#   Table: output of Grid_Table (Mean/STD Train/Test Score, Params columns).
# Lower axis bound from the smallest (mean - std) value.
# NOTE(review): (Temp*100 - Temp) equals 99*Temp, so L ~ floor(99*min)/100;
# this looks like outward rounding of the axis range -- confirm the intent.
Temp = Table['Mean Train Score']-Table['STD Train Score']
Temp = np.append(Temp, Table['Mean Test Score']-Table['STD Test Score'])
L = np.floor((Temp*100- Temp)).min()/100
# Upper axis bound from the largest (mean + std) value.
Temp = Table['Mean Train Score']+Table['STD Train Score']
Temp = np.append(Temp, Table['Mean Test Score']+Table['STD Test Score'])
R = np.ceil((Temp*100 + Temp)).max()/100
fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
subplot_titles=('<b>' + 'Train Set' + '<b>', '<b>' + 'Test Set' + '<b>'))
fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Train Score'], showlegend=False, marker_color= 'SeaGreen',
error_y=dict(type='data',array=Table['STD Train Score'], visible=True)), 1, 1)
fig.add_trace(go.Scatter(x= Table['Params'], y= Table['Mean Test Score'], showlegend=False, marker_color= 'RoyalBlue',
error_y=dict(type='data',array= Table['STD Test Score'], visible=True)), 1, 2)
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
zeroline=False, zerolinewidth=1, zerolinecolor='Black',
showgrid=False, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
zeroline=True, zerolinewidth=1, zerolinecolor='Black',
showgrid=True, gridwidth=1, gridcolor='Lightgray', range= [L, R])
fig.update_yaxes(title_text="Mean Score", row=1, col=1)
fig.update_layout(plot_bgcolor= 'white', width = 980, height = H,
title={'text': '<b>' + 'RandomizedSearchCV with %i-fold cross validation' % n_splits + '<b>',
'x':0.5, 'y':titleY, 'xanchor': 'center', 'yanchor': 'top'})
fig.show()
def Stratified_CV_Scoring(model, X = X, y = y, n_splits = 10):
# Refit and score `model` over `n_splits` stratified shuffle splits (test
# fraction from the notebook-level Test_Size). Per-class precision/recall/
# f1/support are aggregated across splits into 'mean ± std' strings, and the
# confusion matrices are averaged element-wise (rounded to ints).
# Uses the notebook-level `Labels` for the report row names.
# Returns (Reports_Train, Reports_Test, CM_Train, CM_Test).
sss = StratifiedShuffleSplit(n_splits = n_splits, test_size=Test_Size, random_state=42)
# Work on plain numpy arrays so positional indexing below is safe.
if isinstance(X, pd.DataFrame):
X = X.values
if isinstance(y, pd.Series):
y = y.values
_ = sss.get_n_splits(X, y)
Reports_Train = []
Reports_Test = []
CM_Train = []
CM_Test = []
for train_index, test_index in sss.split(X, y):
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
_ = model.fit(X_train,y_train)
# Train
y_pred = model.predict(X_train)
R = pd.DataFrame(metrics.classification_report(y_train, y_pred, target_names=Labels, output_dict=True)).T
Reports_Train.append(R.values)
CM_Train.append(metrics.confusion_matrix(y_train, y_pred))
# Test
y_pred = model.predict(X_test)
R = pd.DataFrame(metrics.classification_report(y_test, y_pred, target_names=Labels, output_dict=True)).T
Reports_Test.append(R.values)
CM_Test.append(metrics.confusion_matrix(y_test, y_pred))
# Train
# Stack the per-split reports/matrices row-wise, then take column-wise
# mean and std; `R` (the last report) supplies the index/column layout.
ALL = Reports_Train[0].ravel()
CM = CM_Train[0].ravel()
for i in range(1, len(Reports_Train)):
ALL = np.vstack((ALL, Reports_Train[i].ravel()))
CM = np.vstack((CM, CM_Train[i].ravel()))
Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
STD = pd.DataFrame(ALL.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
Reports_Train = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
CM_Train = CM.mean(axis = 0).reshape(CM_Train[0].shape).round(0).astype(int)
del ALL, Mean, STD
# Test
# Same aggregation for the test-side reports and matrices.
ALL = Reports_Test[0].ravel()
CM = CM_Test[0].ravel()
for i in range(1, len(Reports_Test)):
ALL = np.vstack((ALL, Reports_Test[i].ravel()))
CM = np.vstack((CM, CM_Test[i].ravel()))
Mean = pd.DataFrame(ALL.mean(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
STD = pd.DataFrame(ALL.std(axis = 0).reshape(R.shape), index = R.index, columns = R.columns)
Reports_Test = Mean.applymap(lambda x: ('%.4f' % x))+ ' ± ' +STD.applymap(lambda x: ('%.4f' % x))
CM_Test = CM.mean(axis = 0).reshape(CM_Test[0].shape).round(0).astype(int)
del ALL, Mean, STD
# Move the report index (class labels) into a titled first column.
Reports_Train = Reports_Train.reset_index().rename(columns ={'index': 'Train Set (CV = % i)' % n_splits})
Reports_Test = Reports_Test.reset_index().rename(columns ={'index': 'Test Set (CV = % i)' % n_splits})
return Reports_Train, Reports_Test, CM_Train, CM_Test
def Confusion_Mat(CM_Train, CM_Test, PD, n_splits = 10):
    """Draw raw and row-normalized confusion-matrix heatmaps.

    For each of the train and test confusion matrices, plots one figure with
    the raw counts (left) and the row-normalized matrix (right).

    Parameters:
        CM_Train, CM_Test -- square confusion matrices (arrays).
        PD       -- plot-parameter dict (FS, annot_kws, shrink, tick_angle,
                    tick_fontsize).
        n_splits -- int or None; if given, it is included in the titles.

    Relies on the notebook-level `Labels` list for the tick labels.
    """
    # FIX: compare with None using `is` (PEP 8), not `==`.
    if n_splits is None:
        Titles = ['Train Set', 'Test Set']
    else:
        Titles = ['Train Set (CV = % i)' % n_splits, 'Test Set (CV = % i)' % n_splits]
    CM = [CM_Train, CM_Test]
    # Green shades for the train figure, blue shades for the test figure.
    Cmap = ['Greens', 'YlGn','Blues', 'PuBu']
    for i in range(2):
        fig, ax = plt.subplots(1, 2, figsize= PD['FS'])
        fig.suptitle(Titles[i], weight = 'bold', fontsize = 16)
        _ = sns.heatmap(CM[i], annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i], ax = ax[0],
                        linewidths = 0.2, cbar_kws={"shrink": PD['shrink']})
        _ = ax[0].set_title('Confusion Matrix');
        # Row-normalize: each row divided by its true-class total.
        Temp = np.round(CM[i].astype('float') / CM[i].sum(axis=1)[:, np.newaxis], 2)
        _ = sns.heatmap(Temp,
                        annot=True, annot_kws={"size": PD['annot_kws']}, cmap=Cmap[2*i+1], ax = ax[1],
                        linewidths = 0.4, vmin=0, vmax=1, cbar_kws={"shrink": PD['shrink']})
        _ = ax[1].set_title('Normalized Confusion Matrix');
        for a in ax:
            _ = a.set_xlabel('Predicted labels')
            _ = a.set_ylabel('True labels');
            _ = a.xaxis.set_ticklabels(Labels, rotation=PD['tick_angle'], fontsize = PD['tick_fontsize'])
            _ = a.yaxis.set_ticklabels(Labels, rotation=PD['tick_angle'], fontsize = PD['tick_fontsize'])
            _ = a.set_aspect(1)
# Baseline: RandomForestClassifier with its default hyper-parameters,
# scored with 20 stratified shuffle-split folds.
Header('Random Forest Classifier with Default Parameters')
n_splits = 20
RFC= RandomForestClassifier()
print('Default Parameters = %s' % RFC.get_params(deep=True))
# NOTE(review): this single fit on X_train is superseded by the per-split
# refits inside Stratified_CV_Scoring -- confirm it is needed at all.
_ = RFC.fit(X_train, y_train)
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(RFC, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'RoyalBlue', 'color': 'White'}))
Line()
PD = dict(FS = (18, 8), annot_kws = 12, shrink = .6, tick_angle = 45, tick_fontsize = 11)
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
Random Forest Classifier with Default Parameters =================================================== Default Parameters = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| (5.30, 13.25] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (13.25, 16.14] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2000 ± 0.4000 |
| (16.14, 18.38] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (18.38, 21.09] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.1500 ± 0.3571 |
| (21.09, 24.38] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (24.38, 36.95] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2500 ± 0.4330 |
| (36.95, 47.30] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2000 ± 0.4000 |
| (47.30, 56.05] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (56.05, 68.52] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2000 ± 0.4000 |
| (68.52, 92.25] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| accuracy | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 |
| macro avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 766.0000 ± 0.0000 |
| weighted avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 766.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| (5.30, 13.25] | 0.9912 ± 0.0162 | 0.9909 ± 0.0139 | 0.9909 ± 0.0100 | 33.0000 ± 0.0000 |
| (13.25, 16.14] | 0.9635 ± 0.0280 | 0.9711 ± 0.0339 | 0.9666 ± 0.0184 | 32.8000 ± 0.4000 |
| (16.14, 18.38] | 0.9534 ± 0.0362 | 0.9591 ± 0.0410 | 0.9554 ± 0.0263 | 33.0000 ± 0.0000 |
| (18.38, 21.09] | 0.9697 ± 0.0269 | 0.9391 ± 0.0441 | 0.9533 ± 0.0230 | 32.8500 ± 0.3571 |
| (21.09, 24.38] | 0.9690 ± 0.0305 | 0.9742 ± 0.0387 | 0.9712 ± 0.0283 | 33.0000 ± 0.0000 |
| (24.38, 36.95] | 0.9597 ± 0.0405 | 0.9802 ± 0.0221 | 0.9694 ± 0.0260 | 32.7500 ± 0.4330 |
| (36.95, 47.30] | 0.9699 ± 0.0324 | 0.9436 ± 0.0363 | 0.9559 ± 0.0245 | 32.8000 ± 0.4000 |
| (47.30, 56.05] | 0.9589 ± 0.0265 | 0.9712 ± 0.0310 | 0.9646 ± 0.0206 | 33.0000 ± 0.0000 |
| (56.05, 68.52] | 0.9676 ± 0.0260 | 0.9878 ± 0.0177 | 0.9775 ± 0.0186 | 32.8000 ± 0.4000 |
| (68.52, 92.25] | 1.0000 ± 0.0000 | 0.9758 ± 0.0227 | 0.9876 ± 0.0117 | 33.0000 ± 0.0000 |
| accuracy | 0.9693 ± 0.0076 | 0.9693 ± 0.0076 | 0.9693 ± 0.0076 | 0.9693 ± 0.0076 |
| macro avg | 0.9703 ± 0.0074 | 0.9693 ± 0.0076 | 0.9692 ± 0.0076 | 329.0000 ± 0.0000 |
| weighted avg | 0.9703 ± 0.0074 | 0.9693 ± 0.0076 | 0.9692 ± 0.0076 | 329.0000 ± 0.0000 |
====================================================================================================
# Recursive Feature Elimination: for each candidate subset size n, keep the
# n best-ranked features and score the refit model on train and test sets.
# NOTE(review): X still contains 'Relative Humidity (Afternoon)', the raw
# variable the target levels were binned from -- this leaks the target into
# the feature ranking; confirm whether it should be dropped from X first.
records = []
for n in range(4, X.shape[1]+1):
    selector = RFE(estimator= RFC, n_features_to_select=n, verbose=0)
    selector.fit(X_train, y_train)
    records.append({'Number of Features to Select': n,
                    'Train F1 Score': metrics.f1_score(y_train, selector.predict(X_train), average = 'macro'),
                    'Test F1 Score': metrics.f1_score(y_test, selector.predict(X_test), average = 'macro'),
                    'Train Recall Score': metrics.recall_score(y_train, selector.predict(X_train), average = 'macro'),
                    'Test Recall Score': metrics.recall_score(y_test, selector.predict(X_test), average = 'macro'),
                    'Best Features': X.columns[selector.ranking_ == 1].tolist()})
# FIX: DataFrame.append was deprecated and removed in pandas 2.0; build the
# frame once from the collected records instead of appending row by row.
df = pd.DataFrame(records)
df['Number of Features to Select'] = df['Number of Features to Select'].astype(int)
df = df.sort_values(by = ['Test Recall Score', 'Test F1 Score'], ascending=False)
# FIX: df['Best Features'][0] is a LABEL lookup that returns the original
# row 0 regardless of the sort order; .iloc[0] picks the top-ranked row.
Best_Features = df['Best Features'].iloc[0]
display(df.style.hide_index().set_precision(4).\
set_properties(subset=['Best Features'], **{'background-color': 'Lavender', 'color': 'Black'}))
| Best Features | Number of Features to Select | Test F1 Score | Test Recall Score | Train F1 Score | Train Recall Score |
|---|---|---|---|---|---|
| ['Air Pressure', 'Air Temperature', 'Relative Humidity (Morning)', 'Relative Humidity (Afternoon)'] | 4 | 0.9847 | 0.9847 | 1.0000 | 1.0000 |
| ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Relative Humidity (Morning)', 'Relative Humidity (Afternoon)'] | 5 | 0.9847 | 0.9847 | 1.0000 | 1.0000 |
| ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Rain Duration', 'Relative Humidity (Morning)', 'Relative Humidity (Afternoon)'] | 9 | 0.9816 | 0.9815 | 1.0000 | 1.0000 |
| ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Rain Accumulation', 'Rain Duration', 'Relative Humidity (Morning)', 'Relative Humidity (Afternoon)'] | 10 | 0.9785 | 0.9785 | 1.0000 | 1.0000 |
| ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)', 'Relative Humidity (Afternoon)'] | 7 | 0.9724 | 0.9724 | 1.0000 | 1.0000 |
| ['Air Pressure', 'Air Temperature', 'Avg Wind Direction', 'Avg Wind Speed', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)', 'Relative Humidity (Afternoon)'] | 8 | 0.9695 | 0.9695 | 1.0000 | 1.0000 |
| ['Air Pressure', 'Air Temperature', 'Max Wind Direction', 'Max Wind Speed', 'Relative Humidity (Morning)', 'Relative Humidity (Afternoon)'] | 6 | 0.9696 | 0.9694 | 1.0000 | 1.0000 |
# Re-score a fresh RFC after the RFE feature ranking.
# NOTE(review): the header says "Best Parameters" but a default-parameter
# RandomForestClassifier is constructed -- confirm the wording or wiring.
Header('Random Forest Classifier with the Best Parameters and Feature Ranking')
RFC= RandomForestClassifier()
print('Default Parameters = %s' % RFC.get_params(deep=True))
_ = RFC.fit(X_train[Best_Features], y_train)
# NOTE(review): the CV scoring below runs on the FULL feature matrix X, not
# X[Best_Features], so the selection above does not affect these reports --
# confirm whether X[Best_Features] was intended here.
Reports_Train, Reports_Test, CM_Train, CM_Test = Stratified_CV_Scoring(RFC, X = X, y = y, n_splits = n_splits)
display(Reports_Train.style.hide_index().set_properties(**{'background-color': 'HoneyDew', 'color': 'Black'}).\
set_properties(subset=['Train Set (CV = % i)' % n_splits], **{'background-color': 'SeaGreen', 'color': 'White'}))
display(Reports_Test.style.hide_index().set_properties(**{'background-color': 'Azure', 'color': 'Black'}).\
set_properties(subset=['Test Set (CV = % i)' % n_splits], **{'background-color': 'RoyalBlue', 'color': 'White'}))
Line()
PD = dict(FS = (18, 8), annot_kws = 12, shrink = .6, tick_angle = 45, tick_fontsize = 11)
Confusion_Mat(CM_Train, CM_Test, PD = PD, n_splits = None)
Random Forest Classifier with the Best Parameters and Feature Ranking ============================== Default Parameters = {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
| Train Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| (5.30, 13.25] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (13.25, 16.14] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2000 ± 0.4000 |
| (16.14, 18.38] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (18.38, 21.09] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.1500 ± 0.3571 |
| (21.09, 24.38] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (24.38, 36.95] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2500 ± 0.4330 |
| (36.95, 47.30] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2000 ± 0.4000 |
| (47.30, 56.05] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| (56.05, 68.52] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 76.2000 ± 0.4000 |
| (68.52, 92.25] | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 77.0000 ± 0.0000 |
| accuracy | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 |
| macro avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 766.0000 ± 0.0000 |
| weighted avg | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 1.0000 ± 0.0000 | 766.0000 ± 0.0000 |
| Test Set (CV = 20) | precision | recall | f1-score | support |
|---|---|---|---|---|
| (5.30, 13.25] | 0.9942 ± 0.0148 | 0.9909 ± 0.0139 | 0.9924 ± 0.0101 | 33.0000 ± 0.0000 |
| (13.25, 16.14] | 0.9594 ± 0.0318 | 0.9772 ± 0.0286 | 0.9677 ± 0.0195 | 32.8000 ± 0.4000 |
| (16.14, 18.38] | 0.9525 ± 0.0453 | 0.9530 ± 0.0338 | 0.9518 ± 0.0279 | 33.0000 ± 0.0000 |
| (18.38, 21.09] | 0.9592 ± 0.0326 | 0.9346 ± 0.0579 | 0.9454 ± 0.0337 | 32.8500 ± 0.3571 |
| (21.09, 24.38] | 0.9710 ± 0.0332 | 0.9727 ± 0.0286 | 0.9713 ± 0.0221 | 33.0000 ± 0.0000 |
| (24.38, 36.95] | 0.9718 ± 0.0266 | 0.9803 ± 0.0276 | 0.9757 ± 0.0206 | 32.7500 ± 0.4330 |
| (36.95, 47.30] | 0.9750 ± 0.0349 | 0.9513 ± 0.0377 | 0.9622 ± 0.0251 | 32.8000 ± 0.4000 |
| (47.30, 56.05] | 0.9629 ± 0.0370 | 0.9742 ± 0.0322 | 0.9678 ± 0.0221 | 33.0000 ± 0.0000 |
| (56.05, 68.52] | 0.9623 ± 0.0285 | 0.9924 ± 0.0131 | 0.9770 ± 0.0194 | 32.8000 ± 0.4000 |
| (68.52, 92.25] | 1.0000 ± 0.0000 | 0.9697 ± 0.0332 | 0.9843 ± 0.0173 | 33.0000 ± 0.0000 |
| accuracy | 0.9696 ± 0.0085 | 0.9696 ± 0.0085 | 0.9696 ± 0.0085 | 0.9696 ± 0.0085 |
| macro avg | 0.9708 ± 0.0082 | 0.9696 ± 0.0085 | 0.9696 ± 0.0086 | 329.0000 ± 0.0000 |
| weighted avg | 0.9708 ± 0.0082 | 0.9696 ± 0.0085 | 0.9696 ± 0.0086 | 329.0000 ± 0.0000 |
====================================================================================================